# import libraries
import pandas as pd
import seaborn as sns
import missingno as msno
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import sklearn
import shap
from sklearn.model_selection import train_test_split
import plotly.express as px
# init shap
shap.initjs()
# change matplotlib style
plt.style.use('fivethirtyeight')
# set global font-size
matplotlib.rcParams.update({'font.size': 14})
# set font-scale for seaborn chart
sns.set(font_scale=1.5)
# change seaborn plot style
sns.set_style("whitegrid")
# read csv file
used_car_df = pd.read_csv('/Users/zhaoyudong/Downloads/vehicles.csv')
used_car_df.shape
(426880, 26)
used_car_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 426880 entries, 0 to 426879 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 426880 non-null int64 1 url 426880 non-null object 2 region 426880 non-null object 3 region_url 426880 non-null object 4 price 426880 non-null int64 5 year 425675 non-null float64 6 manufacturer 409234 non-null object 7 model 421603 non-null object 8 condition 252776 non-null object 9 cylinders 249202 non-null object 10 fuel 423867 non-null object 11 odometer 422480 non-null float64 12 title_status 418638 non-null object 13 transmission 424324 non-null object 14 VIN 265838 non-null object 15 drive 296313 non-null object 16 size 120519 non-null object 17 type 334022 non-null object 18 paint_color 296677 non-null object 19 image_url 426812 non-null object 20 description 426810 non-null object 21 county 0 non-null float64 22 state 426880 non-null object 23 lat 420331 non-null float64 24 long 420331 non-null float64 25 posting_date 426812 non-null object dtypes: float64(5), int64(2), object(19) memory usage: 84.7+ MB
# check null value
used_car_df.isnull().sum(axis = 0)
id 0 url 0 region 0 region_url 0 price 0 year 1205 manufacturer 17646 model 5277 condition 174104 cylinders 177678 fuel 3013 odometer 4400 title_status 8242 transmission 2556 VIN 161042 drive 130567 size 306361 type 92858 paint_color 130203 image_url 68 description 70 county 426880 state 0 lat 6549 long 6549 posting_date 68 dtype: int64
# head 5 row
used_car_df.head()
| id | url | region | region_url | price | year | manufacturer | model | condition | cylinders | ... | size | type | paint_color | image_url | description | county | state | lat | long | posting_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7222695916 | https://prescott.craigslist.org/cto/d/prescott... | prescott | https://prescott.craigslist.org | 6000 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | az | NaN | NaN | NaN |
| 1 | 7218891961 | https://fayar.craigslist.org/ctd/d/bentonville... | fayetteville | https://fayar.craigslist.org | 11900 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | ar | NaN | NaN | NaN |
| 2 | 7221797935 | https://keys.craigslist.org/cto/d/summerland-k... | florida keys | https://keys.craigslist.org | 21000 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | fl | NaN | NaN | NaN |
| 3 | 7222270760 | https://worcester.craigslist.org/cto/d/west-br... | worcester / central MA | https://worcester.craigslist.org | 1500 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | ma | NaN | NaN | NaN |
| 4 | 7210384030 | https://greensboro.craigslist.org/cto/d/trinit... | greensboro | https://greensboro.craigslist.org | 4900 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | nc | NaN | NaN | NaN |
5 rows × 26 columns
# backup dataset
used_car_df1 = used_car_df.copy()
# drop unralted columns
used_car_df = used_car_df.loc[:, ['price','year','manufacturer','condition','cylinders','fuel',
'odometer','transmission','drive','size','type','paint_color','state']]
# show matrix plot of different fields, check null values percentage
msno.matrix(used_car_df, figsize=(13, 5), color=(0,0,0.5))
<AxesSubplot:>
# drop null values
used_car_df = used_car_df.dropna(subset=['price','year','condition','manufacturer',
'cylinders','fuel','odometer','transmission',
'drive','paint_color','type','size'], how='any')
used_car_df.shape
(80170, 13)
used_car_df.head()
| price | year | manufacturer | condition | cylinders | fuel | odometer | transmission | drive | size | type | paint_color | state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 31 | 15000 | 2013.0 | ford | excellent | 6 cylinders | gas | 128000.0 | automatic | rwd | full-size | truck | black | al |
| 55 | 19900 | 2004.0 | ford | good | 8 cylinders | diesel | 88000.0 | automatic | 4wd | full-size | pickup | blue | al |
| 59 | 14000 | 2012.0 | honda | excellent | 6 cylinders | gas | 95000.0 | automatic | fwd | full-size | mini-van | silver | al |
| 65 | 22500 | 2001.0 | ford | good | 8 cylinders | diesel | 144700.0 | manual | rwd | full-size | truck | white | al |
| 73 | 15000 | 2017.0 | dodge | excellent | 8 cylinders | gas | 90000.0 | automatic | rwd | mid-size | sedan | grey | al |
plt.boxplot(used_car_df.odometer)
plt.xlabel('odometer')
plt.show()
plt.boxplot(used_car_df.price)
plt.xlabel('price')
plt.show()
plt.hist(used_car_df['price'],bins=60)
plt.xlabel('price')
plt.title('Distribution of price')
plt.show()
plt.hist(used_car_df['odometer'],bins=60)
plt.xlabel('odometer')
plt.title('Distribution of odometer')
plt.show()
cols = ['price', 'odometer']
used_car_df[cols].describe()
| price | odometer | |
|---|---|---|
| count | 8.017000e+04 | 8.017000e+04 |
| mean | 7.953919e+04 | 1.248147e+05 |
| std | 1.379153e+07 | 2.494849e+05 |
| min | 0.000000e+00 | 0.000000e+00 |
| 25% | 4.950000e+03 | 7.300000e+04 |
| 50% | 9.000000e+03 | 1.140000e+05 |
| 75% | 1.750000e+04 | 1.554675e+05 |
| max | 3.736929e+09 | 1.000000e+07 |
# trim outlier
upper_limit = used_car_df[cols].quantile(0.99)
lower_limit = used_car_df[cols].quantile(0.01)
used_car_df_trimed = used_car_df[~(((used_car_df[cols] <= lower_limit)
|(used_car_df[cols] >= upper_limit)).any(axis=1))]
used_car_df_trimed.shape
(74476, 13)
### define a group of tools
def draw_multiple_price_boxplots(df, group_field, title='', xlabel='', filter_labels=[], orient="v"):
if len(filter_labels) != 0:
df = df[df[group_field].isin(filter_labels)]
size = (8, 8)
if len(df[group_field].unique()) >= 8:
size = (12, 8) if orient != 'h' else (8, 12)
fig,ax = plt.subplots(figsize=size)
x,y = group_field, "price"
if orient == 'h':
x,y = y,x
sns.boxplot(x=x, y=y, data=df, showfliers=False, ax=ax, orient=orient)
ax.set(
axisbelow=True, # Hide the grid behind plot objects
title=title
)
def _draw_bar_plot(df, field, title='', xlabel='', ylabel=''):
fig, ax = plt.subplots(figsize=(12,10))
ax.bar(df[field], df['price'], width=1, edgecolor="white", linewidth=0.7)
ax.set(
title=title,
xlabel=xlabel,
ylabel=ylabel
)
plt.show()
def draw_bar_plot(df, group_field, title='', xlabel='', ylabel=''):
grouped = df.groupby([group_field]).count()
grouped = grouped.reset_index()
_draw_bar_plot(grouped, group_field, title, xlabel, ylabel)
def draw_head_bar_plot(df, group_field, head=10, title='', xlabel='', ylabel=''):
grouped = df.groupby([group_field]).count()
grouped = grouped.reset_index()
grouped = grouped.head(head)
_draw_bar_plot(grouped, group_field, title, xlabel, ylabel)
def draw_pie_plot(df, group_field, title=''):
grouped = df[group_field].value_counts()/len(df) * 100
fig, ax = plt.subplots(figsize=(10,10))
ax.pie(grouped)
labels = [f'{l}, {s:0.1f}%' for l, s in zip(grouped.index, grouped)]
plt.legend(bbox_to_anchor=(1,0.5), loc="center right", labels=labels,
bbox_transform=plt.gcf().transFigure)
plt.savefig("output.png", bbox_inches="tight")
plt.subplots_adjust(left=0.0, bottom=0.2, right=0.9)
plt.title(title)
plt.show()
def draw_line_plot(x, y):
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(x, y, linewidth=2.0)
plt.show()
# price hist
fig,ax = plt.subplots(figsize=(16,10))
sns.histplot(data=used_car_df_trimed, x="price", kde=True, ax=ax)
ax.set(title='Frequency Distribution of Price')
[Text(0.5, 1.0, 'Frequency Distribution of Price')]
# year and confition
fig,ax = plt.subplots(figsize=(16,10))
sns.histplot(data=used_car_df_trimed,
x="year", hue="condition",kde=True, multiple="stack", ax=ax)
ax.set(title='Frequency Distribution of Year')
[Text(0.5, 1.0, 'Frequency Distribution of Year')]
grouped_by_year_df = used_car_df_trimed.groupby(['year'])
grouped_by_year_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| year | ||||||||
| 1900.0 | 1.0 | 38250.000000 | NaN | 38250.0 | 38250.0 | 38250.0 | 38250.0 | 38250.0 |
| 1905.0 | 1.0 | 3990.000000 | NaN | 3990.0 | 3990.0 | 3990.0 | 3990.0 | 3990.0 |
| 1916.0 | 1.0 | 12000.000000 | NaN | 12000.0 | 12000.0 | 12000.0 | 12000.0 | 12000.0 |
| 1918.0 | 1.0 | 16000.000000 | NaN | 16000.0 | 16000.0 | 16000.0 | 16000.0 | 16000.0 |
| 1923.0 | 5.0 | 20298.000000 | 4522.888458 | 15000.0 | 17500.0 | 18990.0 | 25000.0 | 25000.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017.0 | 3399.0 | 22906.838188 | 11897.517403 | 1.0 | 14896.5 | 20980.0 | 31500.0 | 55900.0 |
| 2018.0 | 2620.0 | 26285.083969 | 13062.719164 | 1.0 | 17450.0 | 25256.0 | 36490.0 | 56000.0 |
| 2019.0 | 1622.0 | 28541.700986 | 13234.619328 | 1.0 | 19995.0 | 29900.0 | 36700.0 | 56499.0 |
| 2020.0 | 745.0 | 30739.477852 | 12365.196406 | 1.0 | 21900.0 | 30000.0 | 38500.0 | 56000.0 |
| 2021.0 | 83.0 | 33243.554217 | 14177.420484 | 50.0 | 24995.0 | 32900.0 | 44750.0 | 56000.0 |
101 rows × 8 columns
used_car_df_trimed.odometer.describe()
count 74476.000000 mean 117839.591949 std 58083.058896 min 203.000000 25% 76278.000000 50% 115831.000000 75% 156000.000000 max 295747.000000 Name: odometer, dtype: float64
plt.boxplot(used_car_df_trimed.odometer)
plt.xlabel('odometer')
plt.show()
# odometer hist
fig,ax = plt.subplots(figsize=(16,10))
sns.histplot(data=used_car_df_trimed, x="odometer", kde=True, ax=ax)
ax.set(title='Frequency Distribution of Odometer')
[Text(0.5, 1.0, 'Frequency Distribution of Odometer')]
sns.jointplot(x=used_car_df_trimed['odometer'], y=used_car_df_trimed['price'], height=10, kind="hex", color="#4CB391")
<seaborn.axisgrid.JointGrid at 0x7feb45b36160>
used_car_df_trimed.condition.value_counts()
excellent 36758 good 24453 like new 9777 fair 2907 new 356 salvage 225 Name: condition, dtype: int64
used_car_df_trimed.condition.unique()
array(['excellent', 'good', 'new', 'fair', 'like new', 'salvage'],
dtype=object)
grouped_by_condition_df = used_car_df_trimed.groupby(['condition'])
grouped_by_condition_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| condition | ||||||||
| excellent | 36758.0 | 14202.414903 | 10603.938666 | 1.0 | 6600.0 | 10995.0 | 18900.0 | 56000.0 |
| fair | 2907.0 | 3798.018232 | 3651.584091 | 1.0 | 1550.0 | 2800.0 | 4700.0 | 55000.0 |
| good | 24453.0 | 10782.673578 | 9567.596169 | 1.0 | 4400.0 | 7490.0 | 13900.0 | 56000.0 |
| like new | 9777.0 | 17956.718728 | 12579.254665 | 1.0 | 7995.0 | 14900.0 | 25750.0 | 56499.0 |
| new | 356.0 | 18389.994382 | 15387.959027 | 1.0 | 5995.0 | 13997.0 | 29050.0 | 55000.0 |
| salvage | 225.0 | 4045.231111 | 4590.818723 | 1.0 | 1000.0 | 2000.0 | 5900.0 | 25000.0 |
draw_pie_plot(used_car_df_trimed, 'condition', title='Piechart of Condition')
draw_multiple_price_boxplots(used_car_df_trimed, 'condition',
'Boxplots of price of different fuel', 'fuel')
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="condition", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8
)
<seaborn.axisgrid.FacetGrid at 0x7feb49f6f880>
used_car_df_trimed.manufacturer.unique()
array(['ford', 'honda', 'dodge', 'chrysler', 'toyota', 'jeep', 'lexus',
'chevrolet', 'bmw', 'gmc', 'mercedes-benz', 'mazda', 'rover',
'ram', 'nissan', 'audi', 'mitsubishi', 'infiniti', 'volkswagen',
'kia', 'hyundai', 'fiat', 'acura', 'cadillac', 'lincoln', 'jaguar',
'saturn', 'volvo', 'alfa-romeo', 'buick', 'subaru', 'mini',
'pontiac', 'harley-davidson', 'porsche', 'tesla', 'mercury',
'datsun', 'land rover', 'aston-martin', 'ferrari'], dtype=object)
used_car_df_trimed.manufacturer.value_counts()
ford 14383 chevrolet 11170 toyota 6549 honda 4657 nissan 3809 jeep 3064 gmc 2899 ram 2658 dodge 2357 bmw 2101 subaru 1901 mercedes-benz 1813 volkswagen 1717 hyundai 1667 kia 1315 chrysler 1309 lexus 1198 cadillac 1151 buick 1000 mazda 940 audi 867 pontiac 700 acura 669 lincoln 630 infiniti 615 mitsubishi 596 volvo 579 mini 411 mercury 390 rover 355 saturn 341 porsche 221 jaguar 208 fiat 125 harley-davidson 31 alfa-romeo 28 tesla 25 datsun 14 land rover 8 aston-martin 3 ferrari 2 Name: manufacturer, dtype: int64
grouped_by_manu_df = used_car_df_trimed.groupby(['manufacturer'])
grouped_by_manu_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| manufacturer | ||||||||
| acura | 669.0 | 9385.112108 | 7204.492183 | 1.0 | 4700.00 | 7395.0 | 11999.00 | 46567.0 |
| alfa-romeo | 28.0 | 21795.964286 | 12485.364651 | 2000.0 | 9375.00 | 26800.0 | 28887.50 | 54998.0 |
| aston-martin | 3.0 | 42631.666667 | 9555.182276 | 32900.0 | 37947.50 | 42995.0 | 47497.50 | 52000.0 |
| audi | 867.0 | 13087.080738 | 8960.137121 | 1.0 | 6500.00 | 10999.0 | 16995.00 | 48000.0 |
| bmw | 2101.0 | 12371.571157 | 9771.248331 | 1.0 | 5900.00 | 9998.0 | 16350.00 | 55000.0 |
| buick | 1000.0 | 9640.080000 | 8413.417134 | 1.0 | 3950.00 | 6999.0 | 12995.00 | 50000.0 |
| cadillac | 1151.0 | 11637.825369 | 9539.912961 | 1.0 | 4999.00 | 9991.0 | 15000.00 | 53995.0 |
| chevrolet | 11170.0 | 14983.314772 | 11483.029943 | 1.0 | 5999.00 | 11750.0 | 21700.00 | 56000.0 |
| chrysler | 1309.0 | 7657.862490 | 5736.425782 | 1.0 | 3695.00 | 6250.0 | 9990.00 | 51500.0 |
| datsun | 14.0 | 13434.928571 | 7516.606082 | 2500.0 | 8248.00 | 15000.0 | 18800.00 | 26500.0 |
| dodge | 2357.0 | 10207.248197 | 9148.096780 | 1.0 | 4200.00 | 7500.0 | 12995.00 | 55900.0 |
| ferrari | 2.0 | 21500.500000 | 30404.884484 | 1.0 | 10750.75 | 21500.5 | 32250.25 | 43000.0 |
| fiat | 125.0 | 9099.544000 | 5357.066749 | 1.0 | 5500.00 | 7500.0 | 10950.00 | 32500.0 |
| ford | 14383.0 | 16129.381701 | 12363.794811 | 1.0 | 6500.00 | 12500.0 | 23495.00 | 56499.0 |
| gmc | 2899.0 | 18741.705761 | 13102.827331 | 1.0 | 8495.00 | 15000.0 | 27500.00 | 55680.0 |
| harley-davidson | 31.0 | 13801.612903 | 5332.392213 | 3000.0 | 11975.00 | 11975.0 | 17475.00 | 25950.0 |
| honda | 4657.0 | 8878.240713 | 6413.336155 | 1.0 | 4500.00 | 7000.0 | 11900.00 | 55000.0 |
| hyundai | 1667.0 | 8250.660468 | 5090.428855 | 1.0 | 4500.00 | 7400.0 | 10900.00 | 35995.0 |
| infiniti | 615.0 | 12112.582114 | 8990.761344 | 1.0 | 5800.00 | 9500.0 | 16997.00 | 52000.0 |
| jaguar | 208.0 | 11555.153846 | 9578.759443 | 1.0 | 5298.75 | 7900.0 | 14988.50 | 42995.0 |
| jeep | 3064.0 | 13772.554830 | 10399.795310 | 1.0 | 5500.00 | 11984.0 | 19900.00 | 54995.0 |
| kia | 1315.0 | 8650.286692 | 5685.056397 | 30.0 | 4900.00 | 7898.0 | 11000.00 | 55000.0 |
| land rover | 8.0 | 18116.875000 | 20783.590013 | 3950.0 | 6371.25 | 8247.5 | 19498.75 | 55000.0 |
| lexus | 1198.0 | 13183.651920 | 9573.382715 | 1.0 | 6861.25 | 10500.0 | 16998.00 | 51000.0 |
| lincoln | 630.0 | 9713.228571 | 7182.786736 | 1.0 | 4800.00 | 7797.5 | 13966.00 | 50985.0 |
| mazda | 940.0 | 8399.646809 | 5790.062169 | 1.0 | 4500.00 | 6925.0 | 9999.25 | 34500.0 |
| mercedes-benz | 1813.0 | 13116.517375 | 10631.268623 | 1.0 | 5850.00 | 9995.0 | 17995.00 | 55000.0 |
| mercury | 390.0 | 5601.289744 | 4978.931002 | 1.0 | 2756.25 | 3999.0 | 6495.00 | 35000.0 |
| mini | 411.0 | 9292.732360 | 5223.749597 | 1.0 | 5950.00 | 7995.0 | 10900.00 | 34595.0 |
| mitsubishi | 596.0 | 13272.395973 | 11462.055735 | 1.0 | 4150.00 | 7999.5 | 24990.00 | 39900.0 |
| nissan | 3809.0 | 9676.215280 | 6932.500816 | 1.0 | 4990.00 | 7995.0 | 12900.00 | 51000.0 |
| pontiac | 700.0 | 8142.174286 | 8275.697440 | 1.0 | 3500.00 | 4995.0 | 8525.00 | 46500.0 |
| porsche | 221.0 | 21236.963801 | 12373.857633 | 1.0 | 12000.00 | 18000.0 | 28000.00 | 56000.0 |
| ram | 2658.0 | 22670.181716 | 13884.901137 | 1.0 | 11000.00 | 20990.0 | 32995.00 | 56000.0 |
| rover | 355.0 | 18255.811268 | 13373.402910 | 1.0 | 8000.00 | 15500.0 | 23900.00 | 55812.0 |
| saturn | 341.0 | 4536.299120 | 2863.110247 | 300.0 | 2995.00 | 3995.0 | 5200.00 | 22000.0 |
| subaru | 1901.0 | 10301.986323 | 6855.840268 | 1.0 | 5499.00 | 8500.0 | 13990.00 | 43995.0 |
| tesla | 25.0 | 34804.400000 | 15983.478248 | 1.0 | 28500.00 | 36400.0 | 45000.00 | 54750.0 |
| toyota | 6549.0 | 12214.268133 | 9962.557170 | 1.0 | 5460.00 | 8900.0 | 15900.00 | 55000.0 |
| volkswagen | 1717.0 | 9278.158998 | 6120.498103 | 1.0 | 5000.00 | 7950.0 | 11995.00 | 40000.0 |
| volvo | 579.0 | 8188.341969 | 7972.708722 | 1.0 | 3550.00 | 5900.0 | 9850.00 | 52991.0 |
draw_head_bar_plot(used_car_df_trimed, 'manufacturer', title = 'Barchart of Manufacturer')
top_10_manufacturer = used_car_df_trimed.manufacturer.value_counts().head(10).index.tolist()
draw_multiple_price_boxplots(used_car_df_trimed, 'manufacturer',
'Boxplots of price of different manufacturer',
'manufacturer',filter_labels=top_10_manufacturer)
used_car_df_trimed.fuel.unique()
array(['gas', 'diesel', 'hybrid', 'electric', 'other'], dtype=object)
used_car_df_trimed.fuel.value_counts()
gas 68714 diesel 4633 hybrid 920 electric 117 other 92 Name: fuel, dtype: int64
grouped_by_fuel_df = used_car_df_trimed.groupby(['fuel'])
grouped_by_fuel_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| fuel | ||||||||
| diesel | 4633.0 | 27146.822577 | 14315.667955 | 1.0 | 15000.0 | 26990.0 | 38900.00 | 56499.0 |
| electric | 117.0 | 18387.470085 | 14846.362473 | 1100.0 | 7200.0 | 12999.0 | 23500.00 | 54750.0 |
| gas | 68714.0 | 12243.165323 | 9891.205969 | 1.0 | 5000.0 | 8997.0 | 16500.00 | 56000.0 |
| hybrid | 920.0 | 10334.448913 | 7614.810203 | 1.0 | 5700.0 | 7998.5 | 12499.00 | 49900.0 |
| other | 92.0 | 11681.076087 | 8501.509141 | 5.0 | 4937.5 | 9900.0 | 15923.75 | 45000.0 |
used_car_df_trimed.fuel.value_counts().index
Index(['gas', 'diesel', 'hybrid', 'electric', 'other'], dtype='object')
draw_pie_plot(used_car_df_trimed,'fuel', "Piechart of Fuel Types")
draw_multiple_price_boxplots(used_car_df_trimed, 'fuel',
'Boxplots of price of different fuel', 'fuel')
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="fuel", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8
)
<seaborn.axisgrid.FacetGrid at 0x7feb4acd03a0>
used_car_df_trimed.drive.unique()
array(['rwd', '4wd', 'fwd'], dtype=object)
used_car_df_trimed.drive.value_counts()
4wd 30985 fwd 27000 rwd 16491 Name: drive, dtype: int64
grouped_by_drive_df = used_car_df_trimed.groupby(['drive'])
grouped_by_drive_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| drive | ||||||||
| 4wd | 30985.0 | 16785.611522 | 12418.355071 | 1.0 | 7000.0 | 13900.0 | 23995.0 | 56499.0 |
| fwd | 27000.0 | 8184.754296 | 5911.396897 | 1.0 | 4200.0 | 6888.0 | 10495.0 | 56000.0 |
| rwd | 16491.0 | 14474.030562 | 10752.132758 | 1.0 | 6000.0 | 11500.0 | 20825.0 | 55995.0 |
draw_pie_plot(used_car_df_trimed, 'drive', "Piechart of Drive")
draw_bar_plot(used_car_df_trimed, 'drive', "Piechart of Drive")
draw_multiple_price_boxplots(used_car_df_trimed, 'drive',
'Boxplots of price of different drive', 'drive')
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="drive", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8
)
<seaborn.axisgrid.FacetGrid at 0x7feb4ae21700>
used_car_df_trimed.paint_color.unique()
array(['black', 'blue', 'silver', 'white', 'grey', 'yellow', 'red',
'green', 'brown', 'purple', 'custom', 'orange'], dtype=object)
used_car_df_trimed.paint_color.value_counts()
white 17963 black 13015 silver 10901 grey 8993 blue 7978 red 7507 green 2546 brown 2370 custom 1932 yellow 602 orange 423 purple 246 Name: paint_color, dtype: int64
grouped_by_color_df = used_car_df_trimed.groupby(['paint_color'])
grouped_by_color_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| paint_color | ||||||||
| black | 13015.0 | 14352.902497 | 11278.745947 | 1.0 | 6200.0 | 10995.0 | 19000.00 | 56000.0 |
| blue | 7978.0 | 11125.945976 | 9245.276472 | 1.0 | 4900.0 | 7995.0 | 14900.00 | 55000.0 |
| brown | 2370.0 | 10074.659494 | 8849.902685 | 1.0 | 4062.5 | 7490.0 | 12990.00 | 52995.0 |
| custom | 1932.0 | 12265.525880 | 10355.845933 | 1.0 | 4999.0 | 8894.0 | 15995.00 | 55000.0 |
| green | 2546.0 | 9446.562058 | 9063.892368 | 1.0 | 3500.0 | 6200.0 | 11950.00 | 55950.0 |
| grey | 8993.0 | 12450.698543 | 9978.807525 | 1.0 | 5500.0 | 9300.0 | 16000.00 | 56000.0 |
| orange | 423.0 | 14776.439716 | 10962.077589 | 1.0 | 5999.5 | 11995.0 | 21000.00 | 55000.0 |
| purple | 246.0 | 9423.300813 | 9492.141901 | 200.0 | 3815.0 | 6000.0 | 10999.75 | 55000.0 |
| red | 7507.0 | 12494.775410 | 10633.082820 | 1.0 | 4995.0 | 8995.0 | 16500.00 | 55500.0 |
| silver | 10901.0 | 10948.618751 | 9228.395040 | 1.0 | 4900.0 | 7950.0 | 13999.00 | 56000.0 |
| white | 17963.0 | 16202.734398 | 12136.043584 | 1.0 | 6850.0 | 12988.0 | 23900.00 | 56499.0 |
| yellow | 602.0 | 13047.308970 | 9970.903523 | 1.0 | 5500.0 | 9997.5 | 18000.00 | 54500.0 |
draw_pie_plot(used_car_df_trimed,'paint_color', "Piechart of Paint Color")
draw_bar_plot(used_car_df_trimed,'paint_color', "Bar Chart of Paint Color")
draw_multiple_price_boxplots(used_car_df_trimed, 'paint_color',
'Boxplots of price of different paint color', 'paint_color')
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="paint_color", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8, aspect=1.5
)
<seaborn.axisgrid.FacetGrid at 0x7feb4b2e5700>
used_car_df_trimed.type.unique()
array(['truck', 'pickup', 'mini-van', 'sedan', 'offroad', 'SUV',
'convertible', 'coupe', 'hatchback', 'wagon', 'other', 'van',
'bus'], dtype=object)
used_car_df_trimed.type.value_counts()
sedan 20880 SUV 19061 truck 12182 pickup 6161 coupe 4082 hatchback 3041 van 2621 convertible 2134 mini-van 1794 wagon 1610 other 462 offroad 326 bus 122 Name: type, dtype: int64
grouped_by_type_df = used_car_df_trimed.groupby(['type'])
grouped_by_type_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| type | ||||||||
| SUV | 19061.0 | 12611.693353 | 9354.211886 | 1.0 | 5975.00 | 9998.0 | 16888.00 | 56000.0 |
| bus | 122.0 | 12264.950820 | 8989.174139 | 175.0 | 4700.00 | 11500.0 | 18495.00 | 32500.0 |
| convertible | 2134.0 | 13840.753046 | 9813.212157 | 1.0 | 6800.00 | 11000.0 | 18500.00 | 55000.0 |
| coupe | 4082.0 | 12576.124449 | 10831.705343 | 1.0 | 4800.00 | 8990.0 | 17500.00 | 55995.0 |
| hatchback | 3041.0 | 8256.286748 | 5636.887320 | 1.0 | 4500.00 | 6900.0 | 10000.00 | 50000.0 |
| mini-van | 1794.0 | 8893.641026 | 7733.397366 | 1.0 | 3896.25 | 6200.0 | 11500.00 | 54000.0 |
| offroad | 326.0 | 16297.392638 | 11216.922923 | 13.0 | 7500.00 | 13500.0 | 23388.00 | 54500.0 |
| other | 462.0 | 10856.374459 | 10011.339976 | 1.0 | 3825.00 | 7997.5 | 14500.00 | 52500.0 |
| pickup | 6161.0 | 17535.816751 | 12220.757982 | 1.0 | 7995.00 | 14994.0 | 24995.00 | 56000.0 |
| sedan | 20880.0 | 8347.595307 | 6445.277854 | 1.0 | 4100.00 | 6950.0 | 10500.00 | 55900.0 |
| truck | 12182.0 | 22077.240108 | 13383.732300 | 1.0 | 10950.00 | 20925.0 | 31900.00 | 56499.0 |
| van | 2621.0 | 15982.635254 | 10370.203447 | 1.0 | 7495.00 | 13995.0 | 22487.00 | 55000.0 |
| wagon | 1610.0 | 7738.832919 | 6616.183921 | 1.0 | 3400.00 | 6498.5 | 10387.25 | 54000.0 |
draw_pie_plot(used_car_df_trimed,'type', "Pie Chart of Types")
draw_bar_plot(used_car_df_trimed,'type', "Bar Chart of Types")
draw_multiple_price_boxplots(used_car_df_trimed, 'type',
'Boxplots of price of different types', 'type', orient="h")
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="type", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8, aspect=1.5
)
<seaborn.axisgrid.FacetGrid at 0x7feb471388e0>
used_car_df_trimed.transmission.unique()
array(['automatic', 'manual', 'other'], dtype=object)
used_car_df_trimed.transmission.value_counts()
automatic 68046 manual 5561 other 869 Name: transmission, dtype: int64
grouped_by_transmission_df = used_car_df_trimed.groupby(['transmission'])
grouped_by_transmission_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| transmission | ||||||||
| automatic | 68046.0 | 13367.647915 | 10880.182871 | 1.0 | 5500.0 | 9900.0 | 17995.0 | 56499.0 |
| manual | 5561.0 | 11426.911707 | 9733.070516 | 1.0 | 4500.0 | 8000.0 | 15900.0 | 55500.0 |
| other | 869.0 | 7619.966628 | 10950.993805 | 1.0 | 1.0 | 100.0 | 12500.0 | 50000.0 |
draw_pie_plot(used_car_df_trimed,'transmission', "PieChart of Transmission Types")
draw_multiple_price_boxplots(used_car_df_trimed, 'transmission',
'Boxplots of price of different transmission', 'transmission')
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="transmission", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8
)
<seaborn.axisgrid.FacetGrid at 0x7feb4a164b20>
used_car_df_trimed['size'].unique()
array(['full-size', 'mid-size', 'compact', 'sub-compact'], dtype=object)
used_car_df_trimed['size'].value_counts()
full-size 41399 mid-size 22212 compact 9575 sub-compact 1290 Name: size, dtype: int64
grouped_by_size_df = used_car_df_trimed.groupby(['size'])
grouped_by_size_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| size | ||||||||
| compact | 9575.0 | 8684.537859 | 7133.975688 | 1.0 | 4000.00 | 6988.0 | 11000.0 | 55995.0 |
| full-size | 41399.0 | 15620.299838 | 12020.057764 | 1.0 | 6495.00 | 11999.0 | 22000.0 | 56499.0 |
| mid-size | 22212.0 | 10713.082433 | 8488.268715 | 1.0 | 4950.00 | 7999.0 | 13995.0 | 56000.0 |
| sub-compact | 1290.0 | 9305.075194 | 7306.001295 | 1.0 | 4599.25 | 6999.0 | 11999.0 | 55000.0 |
draw_pie_plot(used_car_df_trimed,'size', 'Piechart of Size')
draw_multiple_price_boxplots(used_car_df_trimed, 'size', 'Boxplots of price of different size')
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="size", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8
)
<seaborn.axisgrid.FacetGrid at 0x7feb4a147c40>
used_car_df_trimed.cylinders.unique()
array(['6 cylinders', '8 cylinders', '4 cylinders', '5 cylinders',
'10 cylinders', '3 cylinders', 'other', '12 cylinders'],
dtype=object)
used_car_df_trimed.cylinders.value_counts()
6 cylinders 25474 4 cylinders 24914 8 cylinders 22231 10 cylinders 757 5 cylinders 697 other 224 3 cylinders 138 12 cylinders 41 Name: cylinders, dtype: int64
grouped_by_cylinders_df = used_car_df_trimed.groupby(['cylinders'])
grouped_by_cylinders_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| cylinders | ||||||||
| 10 cylinders | 757.0 | 15994.978864 | 15111.774061 | 1.0 | 1.00 | 11997.0 | 26990.00 | 55000.0 |
| 12 cylinders | 41.0 | 15947.634146 | 12236.549912 | 1.0 | 9300.00 | 11500.0 | 19900.00 | 52000.0 |
| 3 cylinders | 138.0 | 9828.985507 | 7740.449053 | 1.0 | 4923.75 | 8995.0 | 10998.50 | 34000.0 |
| 4 cylinders | 24914.0 | 9586.935016 | 6833.039030 | 1.0 | 4975.00 | 7976.0 | 12873.25 | 55995.0 |
| 5 cylinders | 697.0 | 7123.466284 | 5150.060638 | 1.0 | 3988.00 | 6000.0 | 8500.00 | 42000.0 |
| 6 cylinders | 25474.0 | 12638.406964 | 10697.934649 | 1.0 | 4988.00 | 8988.0 | 17495.00 | 56000.0 |
| 8 cylinders | 22231.0 | 17838.730107 | 12669.053547 | 1.0 | 7705.50 | 14975.0 | 25990.00 | 56499.0 |
| other | 224.0 | 14847.241071 | 12483.084141 | 1000.0 | 5950.00 | 10000.0 | 20918.50 | 54750.0 |
draw_pie_plot(used_car_df_trimed, 'cylinders', 'Piechart of Cylinders')
draw_multiple_price_boxplots(used_car_df_trimed, 'cylinders', 'Boxplots of price of different clinders', orient="h")
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="cylinders", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8, aspect=1.3
)
<seaborn.axisgrid.FacetGrid at 0x7feb478c6700>
# create grouped dataframe for plotly
groupby_state_df = used_car_df_trimed.groupby(['state'])
groupby_state_df_count = groupby_state_df.count()
groupby_state_df_count['count'] = groupby_state_df_count['price']
groupby_state_df_count = groupby_state_df_count.reset_index()
groupby_state_df_Mean = groupby_state_df.mean()
groupby_state_df_Mean = groupby_state_df_Mean.reset_index()
groupby_state_df_Mean = groupby_state_df_Mean.loc[:,['state', 'price']].merge(
groupby_state_df_count.loc[:,['state', 'count']], left_on='state', right_on='state')
groupby_state_df_Mean['state'] = groupby_state_df_Mean['state'].str.upper()
# draw U.S. heat map
fig = px.choropleth(groupby_state_df_Mean,
locations="state",
color="count",
hover_name="state", # DataFrame column hover info
locationmode = 'USA-states') # Set to plot as US States
fig.update_layout(
title_text = 'Frequency Distribution',
geo_scope='usa',
)
fig.show()
# draw U.S. heat map
fig = px.choropleth(groupby_state_df_Mean,
locations="state",
color="price",
hover_name="state", # DataFrame column hover info
locationmode = 'USA-states') # Set to plot as US States
fig.update_layout(
title_text = 'Average price of different state',
geo_scope='usa',
)
fig.show()
# create dummies value
def categorical_2_ordered_dummies(df, field, orderList):
df[field] = df[field].astype('category')
df[field] = df[field].cat.reorder_categories(orderList, ordered=True)
df[field] = df[field].cat.codes
## ordered
# conditions
categorical_2_ordered_dummies(used_car_df_trimed, 'condition',
['salvage','fair','good','excellent', 'like new', 'new'])
# size
categorical_2_ordered_dummies(used_car_df_trimed, 'size',
['sub-compact', 'compact', 'mid-size', 'full-size'])
def categorical_one_hot_encoding(df, fields):
prefixs = []
for k in fields:
prefixs.append(k + '_')
return pd.get_dummies(df, columns=fields, prefix=prefixs)
# non-ordered
used_car_df_trimed = categorical_one_hot_encoding(used_car_df_trimed,
['manufacturer','cylinders','fuel','transmission','drive','type','paint_color','state'])
# create train data and test data
y = used_car_df_trimed.iloc[:,0]
X = used_car_df_trimed.iloc[:,1:]
# a simple linear model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=10)
# analyze with statsmodel
model = sm.OLS(y_train, X_train)
results = model.fit()
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.607
Model: OLS Adj. R-squared: 0.606
Method: Least Squares F-statistic: 860.9
Date: Fri, 10 Dec 2021 Prob (F-statistic): 0.00
Time: 18:58:24 Log-Likelihood: -7.5512e+05
No. Observations: 73731 AIC: 1.511e+06
Df Residuals: 73598 BIC: 1.512e+06
Df Model: 132
Covariance Type: nonrobust
=================================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------------
year 239.9513 3.099 77.426 0.000 233.877 246.026
condition 1502.8824 36.222 41.491 0.000 1431.888 1573.877
odometer -0.0830 0.000 -170.769 0.000 -0.084 -0.082
size 850.1660 41.115 20.678 0.000 769.581 930.751
manufacturer__acura -9217.8446 339.574 -27.145 0.000 -9883.407 -8552.282
manufacturer__alfa-romeo -6064.8444 1274.234 -4.760 0.000 -8562.338 -3567.351
manufacturer__aston-martin 1.091e+04 3899.397 2.798 0.005 3267.648 1.86e+04
manufacturer__audi -1.053e+04 320.454 -32.846 0.000 -1.12e+04 -9897.625
manufacturer__bmw -1.124e+04 272.173 -41.301 0.000 -1.18e+04 -1.07e+04
manufacturer__buick -1.095e+04 304.863 -35.926 0.000 -1.15e+04 -1.04e+04
manufacturer__cadillac -1.154e+04 298.048 -38.709 0.000 -1.21e+04 -1.1e+04
manufacturer__chevrolet -1.051e+04 232.282 -45.233 0.000 -1.1e+04 -1.01e+04
manufacturer__chrysler -1.3e+04 291.342 -44.626 0.000 -1.36e+04 -1.24e+04
manufacturer__datsun -2237.7431 1785.923 -1.253 0.210 -5738.146 1262.660
manufacturer__dodge -1.256e+04 263.937 -47.584 0.000 -1.31e+04 -1.2e+04
manufacturer__ferrari -9009.0004 4722.756 -1.908 0.056 -1.83e+04 247.583
manufacturer__fiat -1.497e+04 646.825 -23.150 0.000 -1.62e+04 -1.37e+04
manufacturer__ford -1.074e+04 231.052 -46.463 0.000 -1.12e+04 -1.03e+04
manufacturer__gmc -9397.3091 259.823 -36.168 0.000 -9906.560 -8888.058
manufacturer__harley-davidson -1.528e+04 1215.669 -12.570 0.000 -1.77e+04 -1.29e+04
manufacturer__honda -9293.1097 243.757 -38.124 0.000 -9770.873 -8815.347
manufacturer__hyundai -1.169e+04 279.699 -41.778 0.000 -1.22e+04 -1.11e+04
manufacturer__infiniti -1.084e+04 353.492 -30.652 0.000 -1.15e+04 -1.01e+04
manufacturer__jaguar -1.216e+04 514.635 -23.632 0.000 -1.32e+04 -1.12e+04
manufacturer__jeep -1.027e+04 256.406 -40.037 0.000 -1.08e+04 -9763.158
manufacturer__kia -1.197e+04 292.432 -40.929 0.000 -1.25e+04 -1.14e+04
manufacturer__land rover -6190.4106 2355.334 -2.628 0.009 -1.08e+04 -1573.965
manufacturer__lexus -8075.8780 295.690 -27.312 0.000 -8655.429 -7496.327
manufacturer__lincoln -1.301e+04 347.169 -37.482 0.000 -1.37e+04 -1.23e+04
manufacturer__mazda -1.083e+04 313.636 -34.524 0.000 -1.14e+04 -1.02e+04
manufacturer__mercedes-benz -1.085e+04 273.534 -39.672 0.000 -1.14e+04 -1.03e+04
manufacturer__mercury -1.313e+04 400.895 -32.763 0.000 -1.39e+04 -1.23e+04
manufacturer__mini -1.196e+04 405.563 -29.493 0.000 -1.28e+04 -1.12e+04
manufacturer__mitsubishi -1.18e+04 357.158 -33.032 0.000 -1.25e+04 -1.11e+04
manufacturer__nissan -1.245e+04 249.007 -50.006 0.000 -1.29e+04 -1.2e+04
manufacturer__pontiac -9778.7170 332.542 -29.406 0.000 -1.04e+04 -9126.935
manufacturer__porsche -4191.2101 505.672 -8.288 0.000 -5182.325 -3200.096
manufacturer__ram -9683.2826 267.407 -36.212 0.000 -1.02e+04 -9159.167
manufacturer__rover -8456.7464 420.909 -20.092 0.000 -9281.727 -7631.766
manufacturer__saturn -1.29e+04 423.721 -30.437 0.000 -1.37e+04 -1.21e+04
manufacturer__subaru -1.098e+04 282.573 -38.865 0.000 -1.15e+04 -1.04e+04
manufacturer__tesla 6955.5570 1481.135 4.696 0.000 4052.538 9858.576
manufacturer__toyota -7971.8471 237.389 -33.581 0.000 -8437.128 -7506.567
manufacturer__volkswagen -1.12e+04 276.990 -40.452 0.000 -1.17e+04 -1.07e+04
manufacturer__volvo -9740.3312 377.452 -25.806 0.000 -1.05e+04 -9000.527
cylinders__10 cylinders -4.801e+04 711.856 -67.442 0.000 -4.94e+04 -4.66e+04
cylinders__12 cylinders -4.509e+04 1173.021 -38.437 0.000 -4.74e+04 -4.28e+04
cylinders__3 cylinders -5.197e+04 843.923 -61.583 0.000 -5.36e+04 -5.03e+04
cylinders__4 cylinders -5.012e+04 680.901 -73.613 0.000 -5.15e+04 -4.88e+04
cylinders__5 cylinders -5.034e+04 722.803 -69.644 0.000 -5.18e+04 -4.89e+04
cylinders__6 cylinders -4.847e+04 674.496 -71.854 0.000 -4.98e+04 -4.71e+04
cylinders__8 cylinders -4.61e+04 664.592 -69.372 0.000 -4.74e+04 -4.48e+04
cylinders__other -4.869e+04 807.995 -60.264 0.000 -5.03e+04 -4.71e+04
fuel__diesel -6.995e+04 1055.904 -66.249 0.000 -7.2e+04 -6.79e+04
fuel__electric -7.809e+04 1235.237 -63.219 0.000 -8.05e+04 -7.57e+04
fuel__gas -8.014e+04 1050.670 -76.276 0.000 -8.22e+04 -7.81e+04
fuel__hybrid -7.986e+04 1069.523 -74.664 0.000 -8.2e+04 -7.78e+04
fuel__other -8.075e+04 1204.922 -67.020 0.000 -8.31e+04 -7.84e+04
transmission__automatic -1.267e+05 1737.026 -72.969 0.000 -1.3e+05 -1.23e+05
transmission__manual -1.257e+05 1717.805 -73.190 0.000 -1.29e+05 -1.22e+05
transmission__other -1.363e+05 1746.602 -78.048 0.000 -1.4e+05 -1.33e+05
drive__4wd -1.278e+05 1735.352 -73.625 0.000 -1.31e+05 -1.24e+05
drive__fwd -1.315e+05 1735.700 -75.751 0.000 -1.35e+05 -1.28e+05
drive__rwd -1.295e+05 1719.315 -75.348 0.000 -1.33e+05 -1.26e+05
type__SUV -3.1e+04 414.182 -74.847 0.000 -3.18e+04 -3.02e+04
type__bus -3.184e+04 709.971 -44.848 0.000 -3.32e+04 -3.04e+04
type__convertible -2.817e+04 416.225 -67.674 0.000 -2.9e+04 -2.74e+04
type__coupe -2.85e+04 406.100 -70.185 0.000 -2.93e+04 -2.77e+04
type__hatchback -3.201e+04 426.073 -75.123 0.000 -3.28e+04 -3.12e+04
type__mini-van -3.033e+04 439.623 -68.991 0.000 -3.12e+04 -2.95e+04
type__offroad -2.812e+04 534.148 -52.647 0.000 -2.92e+04 -2.71e+04
type__other -3.12e+04 489.304 -63.773 0.000 -3.22e+04 -3.02e+04
type__pickup -2.818e+04 421.306 -66.883 0.000 -2.9e+04 -2.74e+04
type__sedan -3.241e+04 408.694 -79.312 0.000 -3.32e+04 -3.16e+04
type__truck -2.596e+04 423.835 -61.241 0.000 -2.68e+04 -2.51e+04
type__van -2.84e+04 441.056 -64.388 0.000 -2.93e+04 -2.75e+04
type__wagon -3.267e+04 435.266 -75.059 0.000 -3.35e+04 -3.18e+04
paint_color__black -3.19e+04 449.898 -70.906 0.000 -3.28e+04 -3.1e+04
paint_color__blue -3.293e+04 444.040 -74.155 0.000 -3.38e+04 -3.21e+04
paint_color__brown -3.36e+04 453.531 -74.084 0.000 -3.45e+04 -3.27e+04
paint_color__custom -3.229e+04 456.095 -70.788 0.000 -3.32e+04 -3.14e+04
paint_color__green -3.32e+04 444.793 -74.637 0.000 -3.41e+04 -3.23e+04
paint_color__grey -3.263e+04 450.376 -72.444 0.000 -3.35e+04 -3.17e+04
paint_color__orange -3.031e+04 525.384 -57.684 0.000 -3.13e+04 -2.93e+04
paint_color__purple -3.233e+04 582.144 -55.535 0.000 -3.35e+04 -3.12e+04
paint_color__red -3.296e+04 443.399 -74.326 0.000 -3.38e+04 -3.21e+04
paint_color__silver -3.304e+04 448.249 -73.713 0.000 -3.39e+04 -3.22e+04
paint_color__white -3.203e+04 450.156 -71.150 0.000 -3.29e+04 -3.11e+04
paint_color__yellow -3.159e+04 495.373 -63.775 0.000 -3.26e+04 -3.06e+04
state__ak -4266.0620 294.152 -14.503 0.000 -4842.599 -3689.525
state__al -5540.7224 262.360 -21.119 0.000 -6054.946 -5026.499
state__ar -7102.4594 316.199 -22.462 0.000 -7722.209 -6482.710
state__az -7680.8366 199.366 -38.526 0.000 -8071.593 -7290.080
state__ca -6803.0398 131.379 -51.782 0.000 -7060.542 -6545.538
state__co -7753.8914 191.434 -40.504 0.000 -8129.101 -7378.681
state__ct -9166.8312 249.060 -36.806 0.000 -9654.987 -8678.675
state__dc -9351.5770 323.658 -28.893 0.000 -9985.946 -8717.208
state__de -7276.5284 458.806 -15.860 0.000 -8175.787 -6377.269
state__fl -9456.8875 143.488 -65.907 0.000 -9738.124 -9175.651
state__ga -6863.3713 255.073 -26.907 0.000 -7363.314 -6363.429
state__hi -6978.4496 343.427 -20.320 0.000 -7651.566 -6305.334
state__ia -7959.1104 199.146 -39.966 0.000 -8349.435 -7568.786
state__id -7561.2334 211.129 -35.813 0.000 -7975.045 -7147.422
state__il -8261.7857 199.139 -41.487 0.000 -8652.098 -7871.473
state__in -8483.2935 218.487 -38.827 0.000 -8911.528 -8055.059
state__ks -7352.7986 216.379 -33.981 0.000 -7776.901 -6928.696
state__ky -7147.0331 236.702 -30.194 0.000 -7610.968 -6683.099
state__la -8911.9462 374.848 -23.775 0.000 -9646.647 -8177.245
state__ma -9025.8926 197.759 -45.641 0.000 -9413.499 -8638.286
state__md -8436.5022 293.521 -28.742 0.000 -9011.803 -7861.201
state__me -7372.1532 300.944 -24.497 0.000 -7962.002 -6782.304
state__mi -6684.3443 170.053 -39.307 0.000 -7017.647 -6351.041
state__mn -7666.8938 188.781 -40.613 0.000 -8036.903 -7296.884
state__mo -6994.0856 276.937 -25.255 0.000 -7536.881 -6451.290
state__ms -7274.1706 484.724 -15.007 0.000 -8224.229 -6324.113
state__mt -6448.3448 249.090 -25.888 0.000 -6936.561 -5960.129
state__nc -6446.6247 170.588 -37.791 0.000 -6780.977 -6112.272
state__nd -5878.6157 474.784 -12.382 0.000 -6809.191 -4948.041
state__ne -8011.0099 424.912 -18.853 0.000 -8843.836 -7178.184
state__nh -9096.9750 305.669 -29.761 0.000 -9696.085 -8497.865
state__nj -9184.8167 187.197 -49.065 0.000 -9551.723 -8817.910
state__nm -7758.1761 267.073 -29.049 0.000 -8281.639 -7234.713
state__nv -8735.8223 321.791 -27.147 0.000 -9366.532 -8105.113
state__ny -9218.9603 150.194 -61.380 0.000 -9513.340 -8924.581
state__oh -9476.2785 155.115 -61.092 0.000 -9780.303 -9172.254
state__ok -7055.7005 227.069 -31.073 0.000 -7500.754 -6610.647
state__or -7194.0252 180.202 -39.922 0.000 -7547.221 -6840.829
state__pa -9477.5776 167.531 -56.572 0.000 -9805.937 -9149.218
state__ri -9206.7795 293.952 -31.321 0.000 -9782.925 -8630.634
state__sc -6808.9176 228.535 -29.794 0.000 -7256.846 -6360.989
state__sd -7138.6170 399.377 -17.874 0.000 -7921.395 -6355.839
state__tn -5441.7452 196.996 -27.624 0.000 -5827.856 -5055.635
state__tx -8199.9211 159.074 -51.548 0.000 -8511.706 -7888.136
state__ut -5157.1304 474.717 -10.864 0.000 -6087.574 -4226.687
state__va -7893.5122 183.743 -42.959 0.000 -8253.648 -7533.376
state__vt -7287.6483 246.984 -29.507 0.000 -7771.737 -6803.560
state__wa -7689.4792 234.862 -32.740 0.000 -8149.808 -7229.150
state__wi -7762.1902 175.852 -44.141 0.000 -8106.859 -7417.522
state__wv -8403.8389 583.754 -14.396 0.000 -9547.994 -7259.684
state__wy -6448.6090 583.304 -11.055 0.000 -7591.883 -5305.335
==============================================================================
Omnibus: 10684.901 Durbin-Watson: 1.999
Prob(Omnibus): 0.000 Jarque-Bera (JB): 43522.323
Skew: 0.677 Prob(JB): 0.00
Kurtosis: 6.512 Cond. No. 8.52e+16
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.75e-19. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
# analyze with sklearn
model2 = sklearn.linear_model.LinearRegression()
model2.fit(X_train, y_train)
LinearRegression()
model2.intercept_
-464966.0345451308
print("Model coefficients:\n")
for i in range(X.shape[1]):
print(X.columns[i], "=", model2.coef_[i].round(4))
Model coefficients: year = 239.9513 condition = 1502.8824 odometer = -0.083 size = 850.166 manufacturer__acura = 264.9167 manufacturer__alfa-romeo = 3417.917 manufacturer__aston-martin = 20393.2137 manufacturer__audi = -1042.9525 manufacturer__bmw = -1758.2653 manufacturer__buick = -1469.5996 manufacturer__cadillac = -2054.3025 manufacturer__chevrolet = -1023.9662 manufacturer__chrysler = -3518.6552 manufacturer__datsun = 7245.0183 manufacturer__dodge = -3076.2826 manufacturer__ferrari = 473.761 manufacturer__fiat = -5491.2699 manufacturer__ford = -1252.6156 manufacturer__gmc = 85.4523 manufacturer__harley-davidson = -5797.9153 manufacturer__honda = 189.6517 manufacturer__hyundai = -2202.4389 manufacturer__infiniti = -1352.3062 manufacturer__jaguar = -2678.9073 manufacturer__jeep = -782.9519 manufacturer__kia = -2486.2695 manufacturer__land rover = 3292.3508 manufacturer__lexus = 1406.8834 manufacturer__lincoln = -3529.7425 manufacturer__mazda = -1345.1674 manufacturer__mercedes-benz = -1368.7761 manufacturer__mercury = -3651.645 manufacturer__mini = -2478.6066 manufacturer__mitsubishi = -2314.7415 manufacturer__nissan = -2969.0143 manufacturer__pontiac = -295.9557 manufacturer__porsche = 5291.5512 manufacturer__ram = -200.5213 manufacturer__rover = 1026.015 manufacturer__saturn = -3414.1556 manufacturer__subaru = -1499.321 manufacturer__tesla = 16438.3184 manufacturer__toyota = 1510.9142 manufacturer__volkswagen = -1722.0486 manufacturer__volvo = -257.5698 cylinders__10 cylinders = 590.1748 cylinders__12 cylinders = 3511.6491 cylinders__3 cylinders = -3372.536 cylinders__4 cylinders = -1524.0708 cylinders__5 cylinders = -1739.7897 cylinders__6 cylinders = 133.8677 cylinders__8 cylinders = 2494.834 cylinders__other = -94.129 fuel__diesel = 7805.5722 fuel__electric = -331.3028 fuel__gas = -2382.4744 fuel__hybrid = -2096.683 fuel__other = -2995.112 transmission__automatic = 2849.524 transmission__manual = 3870.8609 transmission__other = -6720.3849 drive__4wd = 1831.9734 drive__fwd = -1882.8483 drive__rwd = 50.8749 type__SUV = -1093.0979 type__bus = -1933.8977 type__convertible = 1739.6351 type__coupe = 1404.8631 type__hatchback = -2100.8528 type__mini-van = -423.0418 type__offroad = 1785.8287 type__other = -1297.2837 type__pickup = 1728.8453 type__sedan = -2507.1147 type__truck = 3951.213 type__van = 1508.2585 type__wagon = -2763.355 paint_color__black = 498.8909 paint_color__blue = -528.5452 paint_color__brown = -1200.177 paint_color__custom = 113.6113 paint_color__green = -798.4077 paint_color__grey = -227.6451 paint_color__orange = 2093.3638 paint_color__purple = 69.9019 paint_color__red = -556.6126 paint_color__silver = -642.2036 paint_color__white = 371.0353 paint_color__yellow = 806.788 state__ak = 3357.3344 state__al = 2082.6739 state__ar = 520.937 state__az = -57.4403 state__ca = 820.3566 state__co = -130.495 state__ct = -1543.4348 state__dc = -1728.1806 state__de = 346.868 state__fl = -1833.4911 state__ga = 760.0251 state__hi = 644.9468 state__ia = -335.714 state__id = 62.163 state__il = -638.3893 state__in = -859.8971 state__ks = 270.5978 state__ky = 476.3633 state__la = -1288.5498 state__ma = -1402.4963 state__md = -813.1058 state__me = 251.2432 state__mi = 939.0521 state__mn = -43.4974 state__mo = 629.3108 state__ms = 349.2258 state__mt = 1175.0516 state__nc = 1176.7717 state__nd = 1744.7807 state__ne = -387.6136 state__nh = -1473.5786 state__nj = -1561.4204 state__nm = -134.7797 state__nv = -1112.4259 state__ny = -1595.5639 state__oh = -1852.8821 state__ok = 567.6959 state__or = 429.3712 state__pa = -1854.1812 state__ri = -1583.3831 state__sc = 814.4788 state__sd = 484.7794 state__tn = 2181.6512 state__tx = -576.5247 state__ut = 2466.266 state__va = -270.1158 state__vt = 335.7481 state__wa = -66.0828 state__wi = -138.7938 state__wv = -780.4425 state__wy = 1174.7874
model2.score(X_test, y_test)
0.6456403108839056
# compute the SHAP values for the linear model
explainer = shap.Explainer(model2.predict, X_train)
# explainer process will take several minutes, please wait!
shap_values = explainer(X_test)
Permutation explainer: 746it [00:34, 17.21it/s]
shap.plots.bar(shap_values, max_display = 15)
shap.plots.heatmap(shap_values, max_display = 15)
shap.plots.beeswarm(shap_values, max_display = 15)
# explain sample 1
shap.plots.force(shap_values[0])
shap.plots.waterfall(shap_values[0])
used_car_df[used_car_df.index==y_test.index[0]]
| price | year | manufacturer | condition | cylinders | fuel | odometer | transmission | drive | size | type | paint_color | state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 35697 | 6350 | 2005.0 | audi | excellent | 4 cylinders | gas | 105333.0 | automatic | fwd | compact | convertible | silver | ca |
# explain sample 2
shap.plots.force(shap_values[110])
used_car_df[used_car_df.index==y_test.index[110]]
| price | year | manufacturer | condition | cylinders | fuel | odometer | transmission | drive | size | type | paint_color | state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 291226 | 6995 | 2011.0 | chevrolet | good | 8 cylinders | gas | 229183.0 | automatic | rwd | mid-size | van | white | oh |
shap.plots.waterfall(shap_values[110])